Introductory Data Science for Innovation (995N1) – Week 9, 22 November 2021
# igraph (network analysis) and ggraph (network visualisation) packages
library(tidyverse)
library(tidytext)
library(ggplot2)  # already attached by tidyverse; kept for clarity
library(plotly)
library(igraph)
library(ggraph)
# Load the example news articles, keep three publications, tokenise the
# titles into unigrams, and drop stop words and purely numeric tokens.
my_text_uni <- read_csv("news_articles_example.csv") %>%
  # %in% replaces the original chain of `==` tests joined with `|`
  filter(Publication %in% c("The Guardian (London)",
                            "The New York Times",
                            "The Independent (United Kingdom)")) %>%
  select(id, Title) %>%
  unnest_tokens(output = word, input = Title) %>%
  anti_join(stop_words) %>%
  # coercing to numeric yields NA for non-numbers; keep only those rows
  mutate(word_numeric = as.numeric(word)) %>%
  filter(is.na(word_numeric)) %>%
  select(-word_numeric)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
my_text_uni
## # A tibble: 483 × 2 ## id word ## <dbl> <chr> ## 1 36 island ## 2 36 nations ## 3 36 press ## 4 36 cop26 ## 5 36 winds ## 6 101 cop26 ## 7 101 compromise ## 8 101 calamity ## 9 102 cop26 ## 10 102 hundreds ## # … with 473 more rows
# Prefix document ids with "doc" so they cannot collide with word nodes
# when both become vertex names in the bipartite network.
my_text_uni <- my_text_uni %>%
  mutate(id = str_c("doc", id))
# Build the node list for the bipartite (two-mode) network: documents form
# one node type (type = TRUE) and words the other (type = FALSE).
# TRUE/FALSE are used instead of T/F, which are reassignable variables.
node1 <- my_text_uni %>%
  distinct(id) %>%
  rename(node = id) %>%
  mutate(type = TRUE)
node2 <- my_text_uni %>%
  distinct(word) %>%
  rename(node = word) %>%
  mutate(type = FALSE)
nodes <- bind_rows(node1, node2)
nodes
## # A tibble: 389 × 2 ## node type ## <chr> <lgl> ## 1 doc36 TRUE ## 2 doc101 TRUE ## 3 doc102 TRUE ## 4 doc104 TRUE ## 5 doc105 TRUE ## 6 doc106 TRUE ## 7 doc107 TRUE ## 8 doc108 TRUE ## 9 doc109 TRUE ## 10 doc110 TRUE ## # … with 379 more rows
# Edges are the (document, word) pairs in the my_text_uni object, while
# nodes are in the nodes object. Build the bipartite graph with
# graph_from_data_frame() from the igraph package.
g_bip <- graph_from_data_frame(my_text_uni, vertices = nodes, directed = FALSE)
g_bip
## IGRAPH feee55f UN-B 389 483 -- ## + attr: name (v/c), type (v/l) ## + edges from feee55f (vertex names): ## [1] doc36 --island doc36 --nations doc36 --press ## [4] doc36 --cop26 doc36 --winds doc101--cop26 ## [7] doc101--compromise doc101--calamity doc102--cop26 ## [10] doc102--hundreds doc102--academics doc102--denounce ## [13] doc102--glasgow doc102--summit doc102--failure ## [16] doc102--call doc102--real doc102--green ## [19] doc102--revolution doc104--alternative doc104--worth ## [22] doc104--thinking doc104--nations doc104--redouble ## + ... omitted several edges
# Visualise the bipartite network: grey edges, and node shape/colour
# mapped to the node type (document vs word).
g_bip_vis <- ggraph(g_bip, layout = "stress") +
  geom_edge_link0(edge_colour = "grey") +
  geom_node_point(aes(shape = type, color = type), size = 2.5) +
  theme_graph()
# Project the bipartite graph onto the word mode (type == FALSE), keeping
# edge multiplicities as weights; then keep the first (largest) connected
# component and store each node's degree as a size attribute for plotting.
g <- bipartite_projection(g_bip, multiplicity = TRUE, which = FALSE)
g <- decompose.graph(g)[[1]]
V(g)$size <- degree(g)
g
## IGRAPH 097bb81 UNW- 299 1540 -- ## + attr: name (v/c), size (v/n), weight (e/n) ## + edges from 097bb81 (vertex names): ## [1] island --nations island --press island --cop26 ## [4] island --winds nations--press nations--cop26 ## [7] nations--winds nations--alternative nations--worth ## [10] nations--thinking nations--redouble nations--efforts ## [13] nations--final nations--hours nations--sadiq ## [16] nations--khan press --cop26 press --winds ## [19] cop26 --winds cop26 --compromise cop26 --calamity ## [22] cop26 --hundreds cop26 --academics cop26 --denounce ## + ... omitted several edges
# Word co-occurrence network: edge width mapped to weight, node size to
# degree, with word labels drawn over the nodes.
g_vis <- ggraph(g, layout = "stress") +
  geom_edge_link0(aes(edge_width = weight), edge_colour = "grey66") +
  geom_node_point(aes(size = size), fill = "lightblue",
                  colour = "white", shape = 21) +
  geom_node_text(aes(label = name), size = 2.5) +
  scale_edge_width(range = c(0.5, 3)) +
  scale_size(range = c(1, 10)) +
  theme_graph()
# Toy example: cosine similarity between word-count vectors.
# w1 and w2 share non-zero positions, so their similarity is higher
# than that of w1 and w3.
w1 <- c(1, 0, 0, 0, 2)
w2 <- c(1, 0, 1, 0, 1)
w3 <- c(0, 0, 3, 0, 1)
lsa::cosine(w1, w2)
## [,1] ## [1,] 0.7745967
lsa::cosine(w1, w3)
## [,1] ## [1,] 0.2828427
# Cast the tidy word counts into a document-term matrix: one row per
# document, one column per word, cells holding term frequencies.
my_text_uni <- my_text_uni %>%
  count(id, word) %>%
  cast_dtm(id, word, n)
my_text_uni
## <<DocumentTermMatrix (documents: 65, terms: 324)>> ## Non-/sparse entries: 483/20577 ## Sparsity : 98% ## Maximal term length: 15 ## Weighting : term frequency (tf)
# Cosine similarity between the columns (terms) of the DTM yields a
# term-by-term similarity matrix. Build a weighted undirected graph from
# it, drop edges with weight below 0.3, and keep the first (largest)
# connected component.
my_text_uni <- lsa::cosine(as.matrix(my_text_uni))
g_cos <- graph_from_adjacency_matrix(my_text_uni, mode = "undirected", weighted = TRUE)
g_cos <- delete.edges(g_cos, which(E(g_cos)$weight < 0.3))
g_cos <- decompose.graph(g_cos)[[1]]
g_cos
## IGRAPH 58eaad5 UNW- 281 1562 -- ## + attr: name (v/c), weight (e/n) ## + edges from 58eaad5 (vertex names): ## [1] calamity --calamity calamity --compromise compromise--compromise ## [4] compromise--agreement compromise--change compromise--avoid ## [7] compromise--catastrophic compromise--disaster compromise--global ## [10] compromise--hughes compromise--kate compromise--limits ## [13] compromise--miles compromise--paris compromise--set ## [16] compromise--writes cop26 --cop26 cop26 --final ## [19] cop26 --agreement cop26 --draft cop26 --fossil ## [22] cop26 --fuels cop26 --climate cop26 --deal ## + ... omitted several edges
# Visualise the cosine-similarity word network; edge width mapped to
# the cosine weight, labels drawn over uniform light-blue nodes.
g_vis <- ggraph(g_cos, layout = "stress") +
  geom_edge_link0(aes(edge_width = weight), edge_colour = "grey66") +
  geom_node_point(fill = "lightblue", colour = "white", shape = 21) +
  geom_node_text(aes(label = name), size = 2.5) +
  scale_edge_width(range = c(0.01, 0.1)) +
  scale_size(range = c(1, 10)) +
  theme_graph()
You can save the network in GML format (e.g. with igraph's `write_graph(..., format = "gml")`) to read it in Gephi. The tidytext package includes several lexicons, which are based on classifications of unigrams.
# Bing lexicon: classifies unigrams as positive or negative
get_sentiments("bing")
## # A tibble: 6,786 × 2 ## word sentiment ## <chr> <chr> ## 1 2-faces negative ## 2 abnormal negative ## 3 abolish negative ## 4 abominable negative ## 5 abominably negative ## 6 abominate negative ## 7 abomination negative ## 8 abort negative ## 9 aborted negative ## 10 aborts negative ## # … with 6,776 more rows
# AFINN lexicon: assigns each unigram a numeric sentiment value
# (negative values = negative sentiment)
get_sentiments("afinn")
## # A tibble: 2,477 × 2 ## word value ## <chr> <dbl> ## 1 abandon -2 ## 2 abandoned -2 ## 3 abandons -2 ## 4 abducted -2 ## 5 abduction -2 ## 6 abductions -2 ## 7 abhor -3 ## 8 abhorred -3 ## 9 abhorrent -3 ## 10 abhors -3 ## # … with 2,467 more rows
# NRC lexicon: tags unigrams with sentiment/emotion categories
# (e.g. trust, fear, sadness) — a word can carry several tags
get_sentiments("nrc")
## # A tibble: 13,901 × 2 ## word sentiment ## <chr> <chr> ## 1 abacus trust ## 2 abandon fear ## 3 abandon negative ## 4 abandon sadness ## 5 abandoned anger ## 6 abandoned fear ## 7 abandoned negative ## 8 abandoned sadness ## 9 abandonment anger ## 10 abandonment fear ## # … with 13,891 more rows
# Re-tokenise the corpus using the full article text
# (title + headline + lead) rather than the title alone.
my_text_uni <- read_csv("news_articles_example.csv") %>%
  mutate(all_text = paste(Title, Headline, Hlead, sep = " ")) %>%
  select(id, all_text) %>%
  unnest_tokens(output = word, input = all_text) %>%
  anti_join(stop_words) %>%
  # numeric tokens coerce to numbers; everything else becomes NA
  mutate(as_number = as.numeric(word)) %>%
  filter(is.na(as_number)) %>%
  select(-as_number)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
print(my_text_uni, n = 6)
## # A tibble: 191,069 × 2 ## id word ## <dbl> <chr> ## 1 1 unreal ## 2 1 spectacle ## 3 1 cop26 ## 4 1 cop26 ## 5 1 prepackaged ## 6 1 feel ## # … with 191,063 more rows
# Attach Bing sentiment labels; inner_join keeps only lexicon words.
# The join key is stated explicitly rather than left for dplyr to guess.
my_text_uni_bing <- my_text_uni %>%
  inner_join(get_sentiments("bing"), by = "word")
print(my_text_uni_bing, n = 6)
## # A tibble: 18,108 × 3 ## id word sentiment ## <dbl> <chr> <chr> ## 1 1 unreal positive ## 2 1 indulgence positive ## 3 1 unreal positive ## 4 1 indulgence positive ## 5 1 hard negative ## 6 1 bore negative ## # … with 18,102 more rows
# Count positive/negative words per document and spread sentiment into
# one column per category, filling missing combinations with 0.
my_text_uni_bing_by_id <- my_text_uni_bing %>%
  count(id, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = list(n = 0))
print(my_text_uni_bing_by_id, n = 6)
## # A tibble: 688 × 3 ## id negative positive ## <dbl> <int> <int> ## 1 1 25 15 ## 2 2 7 18 ## 3 3 31 19 ## 4 4 31 19 ## 5 5 28 10 ## 6 6 23 13 ## # … with 682 more rows
# Diverging bar chart: positive counts above zero, negative below.
# geom_col() is the idiomatic equivalent of geom_bar(stat = "identity").
g <- my_text_uni_bing_by_id %>%
  ggplot(aes(id)) +
  geom_col(aes(y = positive), show.legend = FALSE, fill = "blue") +
  geom_col(aes(y = -negative), show.legend = FALSE, fill = "red") +
  scale_y_continuous(limits = c(-180, +180)) +
  xlab(label = "Document ID") +
  ylab(label = "Number of positive and negative words") +
  geom_hline(yintercept = 0, color = "black", size = 0.2)
# Total frequency of each (word, sentiment) pair across the corpus.
# count() already returns an ungrouped tibble, so ungroup() is redundant.
my_text_uni_bing_by_count <- my_text_uni_bing %>%
  count(word, sentiment)
print(my_text_uni_bing_by_count, n = 6)
## # A tibble: 2,232 × 3 ## word sentiment n ## <chr> <chr> <int> ## 1 abolish negative 2 ## 2 abruptly negative 4 ## 3 absence negative 15 ## 4 absurd negative 3 ## 5 abundance positive 7 ## 6 abundant positive 4 ## # … with 2,226 more rows
# Top 15 words per sentiment. slice_max() supersedes top_n() and keeps
# ties by default, matching the original behaviour; reorder_within()
# orders the bars independently inside each facet.
g <- my_text_uni_bing_by_count %>%
  group_by(sentiment) %>%
  slice_max(n, n = 15) %>%
  ungroup() %>%
  mutate(sentiment = as.factor(sentiment),
         word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, nrow = 1, scales = "free_y") +
  coord_flip() +
  theme(legend.position = "none") +
  scale_x_reordered()
# Attach AFINN numeric values; join key made explicit for clarity.
my_text_uni_afinn <- my_text_uni %>%
  inner_join(get_sentiments("afinn"), by = "word")
print(my_text_uni_afinn, n = 6)
## # A tibble: 18,823 × 3 ## id word value ## <dbl> <chr> <dbl> ## 1 1 united 1 ## 2 1 hard -1 ## 3 1 chance 2 ## 4 1 save 2 ## 5 1 bore -2 ## 6 1 agreed 1 ## # … with 18,817 more rows
# Document-level AFINN score: sum the word values per article, then
# label each document by the sign of its total score.
my_text_uni_afinn_by_id <- my_text_uni_afinn %>%
  group_by(id) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(overall_sentiment = if_else(
    sentiment > 0, "positive",
    if_else(sentiment < 0, "negative", "neutral")))
print(my_text_uni_afinn_by_id, n = 6)
## # A tibble: 689 × 3 ## id sentiment overall_sentiment ## <dbl> <dbl> <chr> ## 1 1 -3 negative ## 2 2 34 positive ## 3 3 17 positive ## 4 4 17 positive ## 5 5 -79 negative ## 6 6 -9 negative ## # … with 683 more rows
# Per-document AFINN score, coloured by the overall sentiment label.
# geom_col() replaces the geom_bar(stat = "identity") spelling.
g <- my_text_uni_afinn_by_id %>%
  ggplot(aes(x = id, y = sentiment, fill = overall_sentiment)) +
  geom_col() +
  scale_y_continuous(limits = c(-250, +250)) +
  xlab(label = "Document ID") +
  ylab(label = "Sentiment score") +
  geom_hline(yintercept = 0, color = "black", size = 0.2) +
  theme(legend.position = "bottom")
# Attach NRC categories; a word with several NRC tags produces several
# rows after the join. Join key made explicit.
my_text_uni_nrc <- my_text_uni %>%
  inner_join(get_sentiments("nrc"), by = "word")
print(my_text_uni_nrc, n = 6)
## # A tibble: 83,912 × 3 ## id word sentiment ## <dbl> <chr> <chr> ## 1 1 spectacle negative ## 2 1 spectacle positive ## 3 1 spectacle negative ## 4 1 spectacle positive ## 5 1 united positive ## 6 1 united trust ## # … with 83,906 more rows
# Frequency of each NRC category across the whole corpus. NOTE: despite
# the "_by_id" name, this counts by sentiment only, not per document.
# count() returns an ungrouped tibble, so ungroup() is redundant.
my_text_uni_nrc_by_id <- my_text_uni_nrc %>%
  count(sentiment)
print(my_text_uni_nrc_by_id, n = 6)
## # A tibble: 10 × 2 ## sentiment n ## <chr> <int> ## 1 anger 4722 ## 2 anticipation 8691 ## 3 disgust 2577 ## 4 fear 7824 ## 5 joy 5970 ## 6 negative 11069 ## # … with 4 more rows
# Bar chart of NRC category frequencies, one colour per category.
g <- my_text_uni_nrc_by_id %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  xlab(label = "Sentiment") +
  ylab(label = "Frequency") +
  theme(legend.position = "none")
# The stringr package provides a number of functions to work with
# patterns of text and regular expressions (see the stringr cheat sheet).
my_text <- c("brighton and hove sussex",
             "sussex university",
             "sussex science policy research unit",
             "research impact sussex")
str_view(my_text, "sussex")  # highlight matches of "sussex" anywhere
my_text <- c("brighton and hove sussex",
"sussex university",
"sussex science policy research unit",
"research impact sussex")
# ^ anchors the pattern at the start of the string
str_view(my_text, "^sussex")
my_text <- c("brighton and hove sussex",
"sussex university",
"sussex science policy research unit",
"research impact sussex")
# $ anchors the pattern at the end of the string
str_view(my_text, "sussex$")
# Unlike str_view(), str_extract() returns the matched text itself.
my_text <- c("Parts from the same car manufacturer areinterchangeable",
             "Interchangeability is a principle of mass production of cars")
# zero or more letters on either side of "interchang", case-insensitive
my_regex <- regex("[:alpha:]*interchang[:alpha:]*", ignore_case = TRUE)
str_extract(my_text, my_regex)
## [1] "areinterchangeable" "Interchangeability"
# Same pattern shape around "car": here only the stand-alone words
# "car" and "cars" match (first match per string is returned).
my_text <- c("Parts from the same car manufacturer areinterchangeable",
             "Interchangeability is a principle of mass production of cars")
my_regex <- regex("[:alpha:]*car[:alpha:]*", ignore_case = TRUE)
str_extract(my_text, my_regex)
## [1] "car" "cars"
# Match both British and American spellings (digitalised / digitalized).
# Character class fixed: [zs] matches z or s; the original [z|s] also
# matched a literal "|" character.
my_text <- c("That book has been digitalized",
             "Access to digitalised books has increased")
my_regex <- regex("digitali[zs]e[:alpha:]*", ignore_case = TRUE)
str_extract(my_text, my_regex)
## [1] "digitalized" "digitalised"
# Full preprocessing pipeline in one chain: read the corpus, combine the
# text fields, tokenise, drop stop words and numeric tokens, and cast
# the counts straight into a document-term matrix.
my_text_uni <- read_csv("news_articles_example.csv") %>%
  mutate(article_text = paste(Title, Headline, Hlead, sep = " ")) %>%
  select(id, article_text) %>%
  unnest_tokens(output = word, input = article_text) %>%
  anti_join(stop_words) %>%
  mutate(as_number = as.numeric(word)) %>%  # NA for non-numeric tokens
  filter(is.na(as_number)) %>%
  select(-as_number) %>%
  count(id, word) %>%
  cast_dtm(id, word, n)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
my_text_uni
## <<DocumentTermMatrix (documents: 692, terms: 23633)>> ## Non-/sparse entries: 133282/16220754 ## Sparsity : 99% ## Maximal term length: 40 ## Weighting : term frequency (tf)
# Fit a 4-topic LDA model with the topicmodels package, fixing the random
# seed for reproducibility purposes, then tidy the per-topic word
# probabilities (the "beta" matrix) for plotting.
library(topicmodels)
my_topics <- LDA(my_text_uni, k = 4, control = list(seed = 2020))
my_topics <- tidy(my_topics, matrix = "beta")
my_topics
## # A tibble: 94,532 × 3 ## topic term beta ## <int> <chr> <dbl> ## 1 1 40,000 1.89e- 5 ## 2 2 40,000 5.20e- 11 ## 3 3 40,000 1.29e-141 ## 4 4 40,000 4.92e- 5 ## 5 1 absence 2.76e- 5 ## 6 2 absence 9.15e- 5 ## 7 3 absence 2.74e- 9 ## 8 4 absence 1.44e- 4 ## 9 1 absent 1.51e- 4 ## 10 2 absent 5.95e- 5 ## # … with 94,522 more rows
# Top 10 terms per topic by beta. slice_max() supersedes top_n() and
# keeps ties by default, matching the original behaviour.
g <- my_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme(legend.position = "none")
An interactive visualisation of LDA results can be produced with the LDAvis package, available on CRAN.
Group 1
Adebisi, Jongho, Maria, Keiho
Group 2
Charunan, Poojani, Abdul, Satoshi
Group 3
Oscar, Tsukumo, Jiyoung, Nicholas
Group 4
Alessandro, Shaunna, Jonathan, Rachel
Blei, David M. 2012. “ Probabilistic topic models.” Communications of the ACM 55 (4): 77–84. https://doi.org/10.1145/2133806.2133826.
Blei, David M., Andrew Y. Ng, and Michael I. Jordan. 2003. “Latent Dirichlet Allocation.” Journal of Machine Learning Research 3: 993–1022. https://doi.org/10.1162/jmlr.2003.3.4-5.993.
Bone, Frederique, and Daniele Rotolo. 2020. “ Text mining historical sources to trace technological change. The case of mass production.” Working Paper.
Callon, M., J. P. Courtial, and F. Laville. 1991. “ Co-word analysis as a tool for describing the network of interactions between basic and technological research: The case of polymer chemsitry.” Scientometrics 22 (1): 155–205. https://doi.org/10.1007/BF02019280.
Ciarli, Tommaso, and Ismael Ràfols. 2019. “ The relation between research priorities and societal demands: The case of rice.” Research Policy 48 (4): 949–67. https://doi.org/https://doi.org/10.1016/j.respol.2018.10.027.
Kwartler, Ted. 2017. Text Mining in Practice with R. Chichester, United Kingdom: John Wiley & Sons Ltd. https://doi.org/10.1002/9781119282105.